First we have to download the data.
## Rows: 36,283
## Columns: 106
## $ id <dbl> 44054, 100213, 114384, 1…
## $ listing_url <chr> "https://www.airbnb.com/…
## $ scrape_id <dbl> 2.02e+13, 2.02e+13, 2.02…
## $ last_scraped <date> 2020-06-20, 2020-06-20,…
## $ name <chr> "Modern and Comfortable …
## $ summary <chr> "East Apartments offers …
## $ space <chr> "East Apartments is a we…
## $ description <chr> "East Apartments offers …
## $ experiences_offered <chr> "none", "none", "none", …
## $ neighborhood_overview <chr> "The neighborhood is a p…
## $ notes <chr> "*For long term reservat…
## $ transit <chr> "The easiest method to g…
## $ access <chr> "*Guests have access to …
## $ interaction <chr> NA, NA, "Helen和Wendy会全程为…
## $ house_rules <chr> "Registration All guests…
## $ thumbnail_url <lgl> NA, NA, NA, NA, NA, NA, …
## $ medium_url <lgl> NA, NA, NA, NA, NA, NA, …
## $ picture_url <chr> "https://a0.muscache.com…
## $ xl_picture_url <lgl> NA, NA, NA, NA, NA, NA, …
## $ host_id <dbl> 192875, 527062, 533062, …
## $ host_url <chr> "https://www.airbnb.com/…
## $ host_name <chr> "East Apartments", "Joe"…
## $ host_since <date> 2010-08-06, 2011-04-22,…
## $ host_location <chr> "Beijing, Beijing, China…
## $ host_about <chr> "Hi everyone! My name i…
## $ host_response_time <chr> "within an hour", "N/A",…
## $ host_response_rate <chr> "100%", "N/A", "100%", "…
## $ host_acceptance_rate <chr> "95%", "N/A", "100%", "1…
## $ host_is_superhost <lgl> FALSE, FALSE, FALSE, FAL…
## $ host_thumbnail_url <chr> "https://a0.muscache.com…
## $ host_picture_url <chr> "https://a0.muscache.com…
## $ host_neighbourhood <chr> "Shuangjing", NA, "ITC",…
## $ host_listings_count <dbl> 5, 4, 5, 5, 1, 7, 7, 6, …
## $ host_total_listings_count <dbl> 5, 4, 5, 5, 1, 7, 7, 6, …
## $ host_verifications <chr> "['email', 'phone', 'fac…
## $ host_has_profile_pic <lgl> TRUE, TRUE, TRUE, TRUE, …
## $ host_identity_verified <lgl> FALSE, FALSE, FALSE, FAL…
## $ street <chr> "Beijing, Beijing, China…
## $ neighbourhood <chr> "Chaoyang", NA, "ITC", "…
## $ neighbourhood_cleansed <chr> "朝阳区 / Chaoyang", "密云县 /…
## $ neighbourhood_group_cleansed <lgl> NA, NA, NA, NA, NA, NA, …
## $ city <chr> "Beijing", "Beijing", "B…
## $ state <chr> "Beijing", "Beijing", "B…
## $ zipcode <dbl> 100022, 101508, NA, 1000…
## $ market <chr> "Beijing", "Other (Inter…
## $ smart_location <chr> "Beijing, China", "Beiji…
## $ country_code <chr> "CN", "CN", "CN", "CN", …
## $ country <chr> "China", "China", "China…
## $ latitude <dbl> 39.9, 40.7, 39.9, 39.9, …
## $ longitude <dbl> 116, 117, 116, 116, 116,…
## $ is_location_exact <lgl> TRUE, TRUE, TRUE, FALSE,…
## $ property_type <chr> "Serviced apartment", "G…
## $ room_type <chr> "Entire home/apt", "Priv…
## $ accommodates <dbl> 9, 2, 2, 2, 3, 2, 4, 2, …
## $ bathrooms <dbl> 2, 1, 1, 1, 1, 1, 1, 1, …
## $ bedrooms <dbl> 3, 1, 1, 1, 1, 1, 1, 1, …
## $ beds <dbl> 4, 1, 1, 1, 2, 1, 2, 1, …
## $ bed_type <chr> "Real Bed", "Real Bed", …
## $ amenities <chr> "{TV,\"Cable TV\",Intern…
## $ square_feet <dbl> 1464, NA, NA, NA, 323, N…
## $ price <chr> "$835.00", "$1,203.00", …
## $ weekly_price <chr> "$8,373.00", "$7,200.00"…
## $ monthly_price <chr> "$27,603.00", "$28,800.0…
## $ security_deposit <chr> "$708.00", "$0.00", NA, …
## $ cleaning_fee <chr> "$71.00", "$0.00", NA, "…
## $ guests_included <dbl> 6, 1, 1, 1, 2, 1, 1, 2, …
## $ extra_people <chr> "$71.00", "$0.00", "$0.0…
## $ minimum_nights <dbl> 2, 1, 1, 1, 3, 1, 1, 1, …
## $ maximum_nights <dbl> 365, 30, 730, 1125, 365,…
## $ minimum_minimum_nights <dbl> 2, 1, 1, 1, 3, 1, 1, 1, …
## $ maximum_minimum_nights <dbl> 2, 1, 1, 1, 3, 1, 1, 1, …
## $ minimum_maximum_nights <dbl> 365, 30, 730, 1125, 365,…
## $ maximum_maximum_nights <dbl> 365, 30, 730, 1125, 365,…
## $ minimum_nights_avg_ntm <dbl> 2, 1, 1, 1, 3, 1, 1, 1, …
## $ maximum_nights_avg_ntm <dbl> 365, 30, 730, 1125, 365,…
## $ calendar_updated <chr> "5 months ago", "27 mont…
## $ has_availability <lgl> TRUE, TRUE, TRUE, TRUE, …
## $ availability_30 <dbl> 19, 0, 19, 19, 19, 2, 0,…
## $ availability_60 <dbl> 49, 0, 49, 49, 49, 2, 0,…
## $ availability_90 <dbl> 79, 0, 79, 79, 79, 2, 0,…
## $ availability_365 <dbl> 354, 0, 354, 354, 169, 2…
## $ calendar_last_scraped <date> 2020-06-20, 2020-06-20,…
## $ number_of_reviews <dbl> 99, 2, 66, 10, 290, 26, …
## $ number_of_reviews_ltm <dbl> 7, 0, 1, 1, 22, 0, 2, 0,…
## $ first_review <date> 2010-08-25, 2017-08-27,…
## $ last_review <date> 2020-01-06, 2017-10-08,…
## $ review_scores_rating <dbl> 91, 100, 93, 98, 97, 77,…
## $ review_scores_accuracy <dbl> 9, 10, 10, 9, 10, 8, 8, …
## $ review_scores_cleanliness <dbl> 8, 9, 9, 9, 10, 7, 7, 8,…
## $ review_scores_checkin <dbl> 10, 10, 10, 9, 10, 9, 9,…
## $ review_scores_communication <dbl> 10, 10, 10, 10, 10, 9, 9…
## $ review_scores_location <dbl> 10, 9, 10, 10, 10, 9, 9,…
## $ review_scores_value <dbl> 9, 9, 10, 9, 10, 8, 9, 8…
## $ requires_license <lgl> FALSE, FALSE, FALSE, FAL…
## $ license <chr> NA, NA, "Exempt", "Exemp…
## $ jurisdiction_names <lgl> NA, NA, NA, NA, NA, NA, …
## $ instant_bookable <lgl> FALSE, TRUE, TRUE, TRUE,…
## $ is_business_travel_ready <lgl> FALSE, FALSE, FALSE, FAL…
## $ cancellation_policy <chr> "strict_14_with_grace_pe…
## $ require_guest_profile_picture <lgl> FALSE, FALSE, FALSE, FAL…
## $ require_guest_phone_verification <lgl> FALSE, FALSE, FALSE, FAL…
## $ calculated_host_listings_count <dbl> 5, 4, 5, 5, 1, 5, 5, 6, …
## $ calculated_host_listings_count_entire_homes <dbl> 5, 0, 5, 5, 1, 5, 5, 5, …
## $ calculated_host_listings_count_private_rooms <dbl> 0, 3, 0, 0, 0, 0, 0, 1, …
## $ calculated_host_listings_count_shared_rooms <dbl> 0, 1, 0, 0, 0, 0, 0, 0, …
## $ reviews_per_month <dbl> 0.83, 0.06, 0.73, 0.11, …
From this output we can see that:
+ we have just over 36 thousand observations (Airbnb listings) in Beijing in the data set
+ 106 different variables are included in the data
+ these variables are a mixture of ‘double’, ‘character’, ‘logical’ and ‘date’ types
+ straightaway we can see that some of our ‘price’ variables include dollar signs ($) and are stored as ‘character’ variables rather than ‘double’ variables
Since this is a large data set with a lot going on, we will compute some summary statistics on key variables
# Keep only the columns used in the rest of the analysis and convert the
# currency-formatted text columns (e.g. "$1,203.00") into numeric columns.
listings <- data %>%
  # Let's pick the variables we need
  select(
    price, cleaning_fee, extra_people,
    room_type, property_type,
    number_of_reviews, review_scores_rating,
    longitude, latitude, neighbourhood,
    minimum_nights, guests_included,
    bathrooms, bedrooms, beds, accommodates,
    host_is_superhost, neighbourhood_cleansed, cancellation_policy,
    listing_url, is_location_exact, security_deposit,
    review_scores_cleanliness, instant_bookable, amenities,
    calculated_host_listings_count, reviews_per_month
  ) %>%
  # Remove dollar signs and change the four money columns from chr to dbl
  mutate(
    price = parse_number(price),
    cleaning_fee = parse_number(cleaning_fee),
    extra_people = parse_number(extra_people),
    security_deposit = parse_number(security_deposit)
  )
Now that we have all the variables in the format required, we can move on to the quality of the data.
# Check which variables have lots of missing values (NA's): skim() summarises
# every column of `listings` (see the n_missing and complete_rate columns) and
# kbl() + kable_styling() render that summary as a formatted table
listings %>%
skim() %>%
kbl() %>%
kable_styling()| skim_type | skim_variable | n_missing | complete_rate | character.min | character.max | character.empty | character.n_unique | character.whitespace | logical.mean | logical.count | numeric.mean | numeric.sd | numeric.p0 | numeric.p25 | numeric.p50 | numeric.p75 | numeric.p100 | numeric.hist |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| character | room_type | 0 | 1.000 | 11 | 15 | 0 | 3 | 0 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| character | property_type | 0 | 1.000 | 3 | 22 | 0 | 45 | 0 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| character | neighbourhood | 13370 | 0.632 | 3 | 36 | 0 | 61 | 0 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| character | neighbourhood_cleansed | 0 | 1.000 | 3 | 16 | 0 | 16 | 0 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| character | cancellation_policy | 0 | 1.000 | 8 | 27 | 0 | 3 | 0 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| character | listing_url | 0 | 1.000 | 34 | 37 | 0 | 36283 | 0 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| character | amenities | 0 | 1.000 | 2 | 1917 | 0 | 28222 | 0 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| logical | host_is_superhost | 1 | 1.000 | NA | NA | NA | NA | NA | 0.264 | FAL: 26711, TRU: 9571 | NA | NA | NA | NA | NA | NA | NA | NA |
| logical | is_location_exact | 0 | 1.000 | NA | NA | NA | NA | NA | 0.565 | TRU: 20497, FAL: 15786 | NA | NA | NA | NA | NA | NA | NA | NA |
| logical | instant_bookable | 0 | 1.000 | NA | NA | NA | NA | NA | 0.643 | TRU: 23333, FAL: 12950 | NA | NA | NA | NA | NA | NA | NA | NA |
| numeric | price | 0 | 1.000 | NA | NA | NA | NA | NA | NA | NA | 726.046 | 1861.040 | 0.00 | 255.00 | 396.00 | 651.00 | 70723.0 | ▇▁▁▁▁ |
| numeric | cleaning_fee | 23123 | 0.363 | NA | NA | NA | NA | NA | NA | NA | 60.943 | 218.669 | 0.00 | 0.00 | 40.00 | 70.00 | 10000.0 | ▇▁▁▁▁ |
| numeric | extra_people | 0 | 1.000 | NA | NA | NA | NA | NA | NA | NA | 20.474 | 79.101 | 0.00 | 0.00 | 0.00 | 0.00 | 2118.0 | ▇▁▁▁▁ |
| numeric | number_of_reviews | 0 | 1.000 | NA | NA | NA | NA | NA | NA | NA | 6.752 | 16.834 | 0.00 | 0.00 | 1.00 | 5.00 | 344.0 | ▇▁▁▁▁ |
| numeric | review_scores_rating | 16270 | 0.552 | NA | NA | NA | NA | NA | NA | NA | 94.789 | 10.836 | 20.00 | 94.00 | 100.00 | 100.00 | 100.0 | ▁▁▁▁▇ |
| numeric | longitude | 0 | 1.000 | NA | NA | NA | NA | NA | NA | NA | 116.442 | 0.258 | 115.47 | 116.34 | 116.43 | 116.50 | 117.5 | ▁▁▇▁▁ |
| numeric | latitude | 0 | 1.000 | NA | NA | NA | NA | NA | NA | NA | 40.022 | 0.235 | 39.46 | 39.90 | 39.94 | 40.05 | 41.0 | ▁▇▁▂▁ |
| numeric | minimum_nights | 0 | 1.000 | NA | NA | NA | NA | NA | NA | NA | 4.308 | 28.307 | 1.00 | 1.00 | 1.00 | 1.00 | 1086.0 | ▇▁▁▁▁ |
| numeric | guests_included | 0 | 1.000 | NA | NA | NA | NA | NA | NA | NA | 1.365 | 1.257 | 1.00 | 1.00 | 1.00 | 1.00 | 16.0 | ▇▁▁▁▁ |
| numeric | bathrooms | 21 | 0.999 | NA | NA | NA | NA | NA | NA | NA | 1.424 | 1.375 | 0.00 | 1.00 | 1.00 | 1.50 | 101.5 | ▇▁▁▁▁ |
| numeric | bedrooms | 142 | 0.996 | NA | NA | NA | NA | NA | NA | NA | 1.663 | 1.480 | 0.00 | 1.00 | 1.00 | 2.00 | 50.0 | ▇▁▁▁▁ |
| numeric | beds | 380 | 0.990 | NA | NA | NA | NA | NA | NA | NA | 2.242 | 2.754 | 0.00 | 1.00 | 1.00 | 2.00 | 115.0 | ▇▁▁▁▁ |
| numeric | accommodates | 0 | 1.000 | NA | NA | NA | NA | NA | NA | NA | 3.742 | 3.090 | 1.00 | 2.00 | 2.00 | 4.00 | 18.0 | ▇▁▁▁▁ |
| numeric | security_deposit | 23793 | 0.344 | NA | NA | NA | NA | NA | NA | NA | 655.045 | 2337.306 | 0.00 | 0.00 | 200.00 | 700.00 | 35362.0 | ▇▁▁▁▁ |
| numeric | review_scores_cleanliness | 16272 | 0.552 | NA | NA | NA | NA | NA | NA | NA | 9.518 | 1.065 | 2.00 | 9.00 | 10.00 | 10.00 | 10.0 | ▁▁▁▁▇ |
| numeric | calculated_host_listings_count | 0 | 1.000 | NA | NA | NA | NA | NA | NA | NA | 9.543 | 13.636 | 1.00 | 2.00 | 5.00 | 11.00 | 89.0 | ▇▁▁▁▁ |
| numeric | reviews_per_month | 15644 | 0.569 | NA | NA | NA | NA | NA | NA | NA | 0.649 | 0.850 | 0.01 | 0.14 | 0.31 | 0.81 | 22.9 | ▇▁▁▁▁ |
Here we can see that `cleaning_fee` has an extremely high number of missing (NA) values: 23,123 of the 36,283 listings. This is most likely due to some properties including a cleaning fee within the price, and then not listing the cleaning fee as ‘$0’. A similar issue arises with security deposit. + In consumer psychology, additional costs are often viewed negatively
# Build the cleaned listing table: replace missing fees / review rates with 0
# and derive two logical amenity flags from the free-text `amenities` column.
data_cleaned <- listings %>%
  mutate(
    # In order to handle the high volume of NA's in cleaning_fee, we treat a
    # missing fee as "no separate fee" and change these values to 0
    cleaning_fee = coalesce(cleaning_fee, 0),
    # We apply the same logic to the security_deposit variable
    security_deposit = coalesce(security_deposit, 0),
    # ... and to reviews_per_month
    # (presumably NA means the listing has no reviews yet; confirm)
    reviews_per_month = coalesce(reviews_per_month, 0),
    # Amenity flags. A single case-insensitive pattern covers "Wifi", "wifi"
    # and any other capitalisation; the outer coalesce() keeps the flag FALSE
    # (rather than NA) if `amenities` itself is missing.
    wifi = coalesce(
      str_detect(amenities, regex("wifi", ignore_case = TRUE)),
      FALSE
    ),
    breakfast = coalesce(
      str_detect(amenities, regex("breakfast", ignore_case = TRUE)),
      FALSE
    )
  )
# Let's examine the new wifi and breakfast columns next to price
select(data_cleaned, price, wifi, breakfast)
## # A tibble: 36,283 x 3
## price wifi breakfast
## <dbl> <lgl> <lgl>
## 1 835 TRUE FALSE
## 2 1203 TRUE TRUE
## 3 602 TRUE FALSE
## 4 602 TRUE FALSE
## 5 411 TRUE TRUE
## 6 552 TRUE FALSE
## 7 601 TRUE FALSE
## 8 403 TRUE FALSE
## 9 743 TRUE FALSE
## 10 418 TRUE FALSE
## # … with 36,273 more rows
# Skim only the cleaning_fee variable to see if the NA replacement has
# succeeded (n_missing should now be 0)
data_cleaned %>%
skim(cleaning_fee) %>%
# kbl() and kable_styling() format the resulting table in a more visually appealing way
kbl() %>%
kable_styling()| skim_type | skim_variable | n_missing | complete_rate | numeric.mean | numeric.sd | numeric.p0 | numeric.p25 | numeric.p50 | numeric.p75 | numeric.p100 | numeric.hist |
|---|---|---|---|---|---|---|---|---|---|---|---|
| numeric | cleaning_fee | 0 | 1 | 22.1 | 135 | 0 | 0 | 0 | 0 | 10000 | ▇▁▁▁▁ |
# Density plot of price on the default (unrestricted) x-axis, stored as p1.
# It is built first to check readability before the other numerical variables
# are plotted.
p1 <- ggplot(data = data_cleaned, aes(x = price)) +
geom_density() +
theme_bw()Before creating plots for all other numerical variables, let’s check the readability
# Some variables have such long right tails that the default x-axis makes it
# difficult to get a full picture of their variability, so most plots below
# get an explicit upper x-axis limit.

# Build a density plot (theme_bw) of one column of a data frame.
#   df    - data frame holding the column
#   var   - unquoted name of the column mapped to the x-axis
#   x_max - optional upper x-axis limit; when supplied the axis runs from 0 to
#           x_max, when NULL the default axis range is kept
# Note: xlim() sets scale limits, so observations outside [0, x_max] are
# removed before the density is estimated, not merely hidden.
plot_density <- function(df, var, x_max = NULL) {
  p <- ggplot(data = df, aes(x = {{ var }})) +
    geom_density()
  if (!is.null(x_max)) {
    p <- p + xlim(0, x_max)
  }
  p + theme_bw()
}

# One plot per numerical variable; limits are chosen per variable where necessary
p1a <- plot_density(data_cleaned, price, x_max = 10000)
p2a <- plot_density(data_cleaned, cleaning_fee, x_max = 300)
p3a <- plot_density(data_cleaned, guests_included, x_max = 8)
p4a <- plot_density(data_cleaned, extra_people, x_max = 400)
p5a <- plot_density(data_cleaned, number_of_reviews, x_max = 100)
p6a <- plot_density(data_cleaned, review_scores_rating, x_max = 100)
p7a <- plot_density(data_cleaned, minimum_nights, x_max = 150)
# accommodates is readable on its default axis, so no limit is applied
p8a <- plot_density(data_cleaned, accommodates)
p9a <- plot_density(data_cleaned, beds, x_max = 20)
p10a <- plot_density(data_cleaned, bathrooms, x_max = 20)
p11a <- plot_density(data_cleaned, bedrooms, x_max = 15)

# Combine all density plots into a single grid with `+`
p1a + p2a + p3a + p4a + p5a + p6a + p7a + p8a + p9a + p10a + p11a
# using ggpairs to plot a correlation matrix
# Pairwise plots and correlation coefficients for price and the other
# numerical variables (this selection also includes security_deposit)
data_cleaned %>%
# select() keeps only the columns that should appear in the matrix
select(c(price, cleaning_fee, guests_included,
extra_people, number_of_reviews, review_scores_rating,
minimum_nights, accommodates, beds, bathrooms, bedrooms, security_deposit)
) %>%
ggpairs() > Notable correlations with price are: 1. Accomodates (number of people the listing can accomodate) 2. Bedrooms (number of bedrooms at the listing) 3. Bathrooms (number of bathrooms at the listing) 4. Beds (number of beds at the listing) 5. Cleaning fee (additional flat cleaning fee) 6. Guests included (number of guests included in the price and exempt from
Notable correlations between variables: 1. Accommodates/Beds/Bathrooms/Bedrooms - the greater the number of rooms, the greater the number of guests it can accommodate
These plots demonstrate????
Some of the character variables have lots of different values, e.g.
# Frequency of each property type, arranged in descending order of frequency
data_cleaned %>%
  count(property_type, sort = TRUE)
## # A tibble: 45 x 2
## property_type n
## <chr> <int>
## 1 Apartment 14428
## 2 Condominium 4761
## 3 House 4129
## 4 Loft 2960
## 5 Serviced apartment 2189
## 6 Farm stay 1330
## 7 Villa 1222
## 8 Bungalow 985
## 9 Cottage 596
## 10 Townhouse 513
## # … with 35 more rows
We’re now classifying different types of properties into 5 groups - the 4 most prominent ones and remaining smaller categories labeled as ‘Other’.
# Add 'prop_type_simplified', which groups property types into 5 categories:
# the top 4 types keep their own value and every other type (for example
# "Boutique hotel") becomes "Other".
cleaning <- data_cleaned %>%
  mutate(
    prop_type_simplified = if_else(
      property_type %in% c("Apartment", "Condominium", "House", "Loft"),
      property_type,
      "Other"
    )
  )
Now that our categorical variables are cleaned, we can inspect the variability as we did with the numerical variables, this time using bar plots. Plotting property types, room types, super host status and cancellation policy, to analyse their distribution.
# One bar plot (listing counts) per categorical variable, each in theme_bw
p12 <- cleaning %>%
  ggplot(aes(x = prop_type_simplified)) +
  geom_bar() +
  theme_bw()
p13 <- cleaning %>%
  ggplot(aes(x = room_type)) +
  geom_bar() +
  theme_bw()
p14 <- cleaning %>%
  ggplot(aes(x = host_is_superhost)) +
  geom_bar() +
  theme_bw()
p15 <- cleaning %>%
  ggplot(aes(x = cancellation_policy)) +
  geom_bar() +
  theme_bw()
# Using patchwork to create a clean grid of the bar plots
p12 + p13 + p14 + p15
commentary needed on bar plots
# Explore the correlation between our numerical variables: select the columns
# to plot against each other, then let ggpairs() draw a grid of scatter plots
# with correlation coefficients. The resulting plot is stored and displayed.
data_numerical <- ggpairs(
  select(
    data_cleaned,
    price, cleaning_fee, guests_included, extra_people,
    number_of_reviews, review_scores_rating, minimum_nights,
    accommodates, beds, bathrooms, bedrooms
  )
)
data_numerical
> Notable correlations with price are: 1. Accomodates (number of people the listing can accomodate) 1. Bedrooms (number of bedrooms at the listing) 1. Bathrooms (number of bathrooms at the listing) 1. Beds (number of beds at the listing) 1. Cleaning fee (additional flat cleaning fee) 1. Guests included (number of guests included in the price and exempt from
Notable correlations between variables: 1. Accommodates/Beds/Bathrooms/Bedrooms - this makes sense because the greater the number of rooms, the greater the number of guests the listing can accommodate.
As we are looking at data over a geographical region, it can be helpful to see the geospatial spread of the Airbnb listings. Here we use the leaflet package to map our longitude and latitude data onto a map.
# Map the listings with a minimum stay of at most 4 nights using leaflet
cleaning %>%
  filter(minimum_nights <= 4) %>%
  leaflet() %>%
  # Base map that lies beneath the data points
  addProviderTiles("OpenStreetMap.Mapnik") %>%
  # Each listing becomes a small circle; popup and label come from the
  # listing URL and the property type
  addCircleMarkers(
    lng = ~longitude,
    lat = ~latitude,
    radius = 1,
    fillColor = "blue",
    fillOpacity = 0.4,
    popup = ~listing_url,
    label = ~property_type
  )
In order to run a regression model, we will transform our price data into a approximately ‘normal’ distribution.
# We want to use log to transform our data into a more normal looking distribution, so let's first see how a distribution looks on a log scale
# NOTE(review): the surrounding text talks about transforming price, but this plot maps minimum_nights. Confirm which variable was intended.
cleaning %>%
ggplot() +
geom_density(aes(x = minimum_nights)) +
# scale_x_log10() transforms the x-axis by log10
scale_x_log10()## # A tibble: 36,283 x 30
## price cleaning_fee extra_people room_type property_type number_of_revie…
## <dbl> <dbl> <dbl> <chr> <chr> <dbl>
## 1 835 71 71 Entire h… Serviced apa… 99
## 2 1203 0 0 Private … Guest suite 2
## 3 602 0 0 Entire h… Apartment 66
## 4 602 30 0 Entire h… Apartment 10
## 5 411 71 106 Entire h… House 290
## 6 552 0 0 Entire h… Apartment 26
## 7 601 0 0 Entire h… Apartment 39
## 8 403 0 64 Entire h… Apartment 30
## 9 743 283 0 Entire h… Apartment 117
## 10 418 35 80 Entire h… Apartment 3
## # … with 36,273 more rows, and 24 more variables: review_scores_rating <dbl>,
## # longitude <dbl>, latitude <dbl>, neighbourhood <chr>, minimum_nights <dbl>,
## # guests_included <dbl>, bathrooms <dbl>, bedrooms <dbl>, beds <dbl>,
## # accommodates <dbl>, host_is_superhost <lgl>, neighbourhood_cleansed <chr>,
## # cancellation_policy <chr>, listing_url <chr>, is_location_exact <lgl>,
## # security_deposit <dbl>, review_scores_cleanliness <dbl>,
## # instant_bookable <lgl>, amenities <chr>,
## # calculated_host_listings_count <dbl>, reviews_per_month <dbl>, wifi <lgl>,
## # breakfast <lgl>, prop_type_simplified <chr>
As we are looking to model the price of an Airbnb in Beijing for travel/tourism, we should look into the minimum_nights variable. This variable states the minimum number of nights you are able to book the listing for.
# Tabulate how many listings share each minimum_nights value,
# arranged in descending order of frequency
cleaning %>%
  count(minimum_nights, sort = TRUE)
## # A tibble: 66 x 2
## minimum_nights n
## <dbl> <int>
## 1 1 30216
## 2 2 2178
## 3 3 1024
## 4 30 819
## 5 7 369
## 6 5 368
## 7 15 316
## 8 90 175
## 9 10 161
## 10 60 89
## # … with 56 more rows
# Summary statistics for the distribution of minimum_nights,
# rendered as a formatted table
kable_styling(
  kbl(
    favstats(~ minimum_nights, data = cleaning)
  )
)
| min | Q1 | median | Q3 | max | mean | sd | n | missing | |
|---|---|---|---|---|---|---|---|---|---|
| 1 | 1 | 1 | 1 | 1086 | 4.31 | 28.3 | 36283 | 0 |
From the above, we can infer the following - + The most common values for ‘minimum nights’ are 1 to 3 nights as they account for 92.1% of total listings. The next biggest category is ‘30 minimum nights’ (2.26% of total listings) + 30 minimum nights seem rather strange - maybe the people booking the Airbnbs are visiting Beijing for reasons other than leisure/ travel. For example, they may prefer Airbnbs as a budget friendly alternative to hotels for longer stays intended for business-related work, etc. + There are 61 listings for minimum nights of 365 days (1 year) as well which implies that some Airbnbs are more for the purpose of long-term renting or sub-letting.
# Lookup table read from CSV; it is joined on `neighbourhood` and supplies the
# `Ring` column used below (presumably one row per neighbourhood; confirm).
neighbourhoodring <- vroom::vroom("neighbourhoodring.csv")

# Build the modelling table: short-stay listings only, with the 4-night price
# (raw and log) and a simplified neighbourhood variable.
regression_data <- cleaning %>%
  # filter for minimum nights at most 4
  filter(minimum_nights <= 4) %>%
  # Attach the Ring of each listing's neighbourhood; listings without a match
  # keep all their columns and get Ring = NA
  left_join(neighbourhoodring, by = "neighbourhood") %>%
  mutate(
    # New variable that computes the price of 2 people booking an Airbnb for
    # 4 nights
    # Note: extra_people charge per 1 extra person applied per night when
    # no. of guests > guests_included
    price_for_4_notlog = case_when(
      guests_included < 2 ~ cleaning_fee + (4 * (price + extra_people)),
      TRUE ~ cleaning_fee + (4 * price)
    ),
    # Log of the 4-night price; the tiny offset keeps log() finite when the
    # total is 0.
    # NOTE(review): such rows end up at log(0.00001), roughly -11.5, far from
    # the rest of the data. Consider whether zero-price listings should be
    # filtered out instead.
    price_4_nights = log(price_for_4_notlog + 0.00001),
    # New variable that classifies neighbourhood into 5 areas according to
    # Beijing's geographical characteristic. The 5 areas are Ring 2-6.
    # Every listing whose Ring is not 2-5, including listings with no
    # matching neighbourhood (Ring = NA), falls into "Ring 6".
    neighbourhood_simplified = case_when(
      Ring == "2" ~ "Ring 2",
      Ring == "3" ~ "Ring 3",
      Ring == "4" ~ "Ring 4",
      Ring == "5" ~ "Ring 5",
      TRUE ~ "Ring 6"
    )
  ) %>%
  # Ring has served its purpose; keep only the simplified label
  select(-Ring)
regression_data
## # A tibble: 33,497 x 33
## price cleaning_fee extra_people room_type property_type number_of_revie…
## <dbl> <dbl> <dbl> <chr> <chr> <dbl>
## 1 835 71 71 Entire h… Serviced apa… 99
## 2 1203 0 0 Private … Guest suite 2
## 3 602 0 0 Entire h… Apartment 66
## 4 602 30 0 Entire h… Apartment 10
## 5 411 71 106 Entire h… House 290
## 6 552 0 0 Entire h… Apartment 26
## 7 601 0 0 Entire h… Apartment 39
## 8 403 0 64 Entire h… Apartment 30
## 9 743 283 0 Entire h… Apartment 117
## 10 418 35 80 Entire h… Apartment 3
## # … with 33,487 more rows, and 27 more variables: review_scores_rating <dbl>,
## # longitude <dbl>, latitude <dbl>, neighbourhood <chr>, minimum_nights <dbl>,
## # guests_included <dbl>, bathrooms <dbl>, bedrooms <dbl>, beds <dbl>,
## # accommodates <dbl>, host_is_superhost <lgl>, neighbourhood_cleansed <chr>,
## # cancellation_policy <chr>, listing_url <chr>, is_location_exact <lgl>,
## # security_deposit <dbl>, review_scores_cleanliness <dbl>,
## # instant_bookable <lgl>, amenities <chr>,
## # calculated_host_listings_count <dbl>, reviews_per_month <dbl>, wifi <lgl>,
## # breakfast <lgl>, prop_type_simplified <chr>, price_for_4_notlog <dbl>,
## # price_4_nights <dbl>, neighbourhood_simplified <chr>
# Histogram of the (non-log) price of four nights, x-axis limited to 0-40000
regression_data %>%
  ggplot(aes(x = price_for_4_notlog)) +
  geom_histogram() +
  xlim(0, 40000)
# Density of the log of the price of four nights
# we use loggy-loggy to effectively change the case from a unit change to a percentage change
regression_data %>%
  ggplot(aes(x = price_4_nights)) +
  geom_density()
# look at cleaned data for regression models
glimpse(regression_data)
## Rows: 33,497
## Columns: 33
## $ price <dbl> 835, 1203, 602, 602, 411, 552, 601, 40…
## $ cleaning_fee <dbl> 71, 0, 0, 30, 71, 0, 0, 0, 283, 35, 0,…
## $ extra_people <dbl> 71, 0, 0, 0, 106, 0, 0, 64, 0, 80, 63,…
## $ room_type <chr> "Entire home/apt", "Private room", "En…
## $ property_type <chr> "Serviced apartment", "Guest suite", "…
## $ number_of_reviews <dbl> 99, 2, 66, 10, 290, 26, 39, 30, 117, 3…
## $ review_scores_rating <dbl> 91, 100, 93, 98, 97, 77, 86, 83, 99, 1…
## $ longitude <dbl> 116, 117, 116, 116, 116, 116, 116, 116…
## $ latitude <dbl> 39.9, 40.7, 39.9, 39.9, 39.9, 39.9, 39…
## $ neighbourhood <chr> "Chaoyang", NA, "ITC", "Chaoyang", "Do…
## $ minimum_nights <dbl> 2, 1, 1, 1, 3, 1, 1, 1, 3, 1, 1, 1, 2,…
## $ guests_included <dbl> 6, 1, 1, 1, 2, 1, 1, 2, 2, 2, 2, 2, 4,…
## $ bathrooms <dbl> 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 0, 1, 1,…
## $ bedrooms <dbl> 3, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2,…
## $ beds <dbl> 4, 1, 1, 1, 2, 1, 2, 1, 3, 1, 1, 1, 2,…
## $ accommodates <dbl> 9, 2, 2, 2, 3, 2, 4, 2, 4, 3, 2, 2, 6,…
## $ host_is_superhost <lgl> FALSE, FALSE, FALSE, FALSE, TRUE, FALS…
## $ neighbourhood_cleansed <chr> "朝阳区 / Chaoyang", "密云县 / Miyun", "朝阳区 …
## $ cancellation_policy <chr> "strict_14_with_grace_period", "strict…
## $ listing_url <chr> "https://www.airbnb.com/rooms/44054", …
## $ is_location_exact <lgl> TRUE, TRUE, TRUE, FALSE, FALSE, TRUE, …
## $ security_deposit <dbl> 708, 0, 0, 0, 0, 0, 0, 700, 0, 1000, 9…
## $ review_scores_cleanliness <dbl> 8, 9, 9, 9, 10, 7, 7, 8, 10, 7, 7, 9, …
## $ instant_bookable <lgl> FALSE, TRUE, TRUE, TRUE, FALSE, TRUE, …
## $ amenities <chr> "{TV,\"Cable TV\",Internet,Wifi,\"Air …
## $ calculated_host_listings_count <dbl> 5, 4, 5, 5, 1, 5, 5, 6, 1, 8, 6, 8, 1,…
## $ reviews_per_month <dbl> 0.83, 0.06, 0.73, 0.11, 2.63, 0.24, 0.…
## $ wifi <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TR…
## $ breakfast <lgl> FALSE, TRUE, FALSE, FALSE, TRUE, FALSE…
## $ prop_type_simplified <chr> "Other", "Other", "Apartment", "Apartm…
## $ price_for_4_notlog <dbl> 3411, 4812, 2408, 2438, 1715, 2208, 24…
## $ price_4_nights <dbl> 8.13, 8.48, 7.79, 7.80, 7.45, 7.70, 7.…
## $ neighbourhood_simplified <chr> "Ring 4", "Ring 6", "Ring 3", "Ring 4"…
# model 1 with a few variables - property type plus the two review variables
model1 <- lm(
  formula = price_4_nights ~
    prop_type_simplified + number_of_reviews + review_scores_rating,
  data = regression_data
)
# Coefficient table including confidence intervals
tidy(model1, conf.int = TRUE)
## # A tibble: 7 x 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 6.91 0.0486 142. 0. 6.81 7.00
## 2 prop_type_simplifie… -0.0594 0.0165 -3.60 3.15e- 4 -0.0918 -0.0271
## 3 prop_type_simplifie… 0.204 0.0184 11.1 2.54e- 28 0.168 0.240
## 4 prop_type_simplifie… 0.101 0.0200 5.07 4.11e- 7 0.0621 0.140
## 5 prop_type_simplifie… 0.455 0.0148 30.7 1.47e-202 0.426 0.484
## 6 number_of_reviews -0.00204 0.000266 -7.66 1.88e- 14 -0.00256 -0.00152
## 7 review_scores_rating 0.00447 0.000508 8.80 1.45e- 18 0.00348 0.00547
| r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual | nobs |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.067 | 0.067 | 0.762 | 223 | 0 | 6 | -21392 | 42800 | 42863 | 10832 | 18636 | 18643 |
Here, property type is a categorical variable - it has five categories and therefore makes up 4 dummy variables in the regression model. For example, the intercept term for ‘Apartment’ would just be ~ 6.91. For ‘House’, prop_type_simplifiedHouse = 1 (prop_type_simplifiedCondominium = 0 and prop_type_simplifiedOther = 0) and the intercept term would be 6.91 + 0.2 ~ 7.11. For ‘Other’, prop_type_simplifiedOther = 1 (prop_type_simplifiedCondominium = 0 and prop_type_simplifiedHouse = 0) and the intercept term would be 6.91 + 0.46 ~ 7.37. Therefore, relative to apartments, price_4_nights will be higher for houses and lofts but lower for condominiums.
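(Note: our Y variable is in log, so the coefficient of each X variable, multiplied by 100, represents the approximate percentage change in the price for 4 nights per unit change in that X variable. The approximation is best for small coefficients; the exact change is 100 × (exp(coefficient) − 1) percent.)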
Other variables such as number_of_reviews and review_scores_rating are statistically significant and explain the variation in price_4_nights, however, a point worth noting is that additional number_of_reviews do not lead to an increase in cost for 4 nights as the reviews may not necessarily be good reviews. On the other hand, review_scores_rating has a positive effect on price_4_nights which means that properties with a higher score/ rating would be more pricey.
# model 2 = model 1 + room type
model2 <- lm(
  formula = price_4_nights ~
    prop_type_simplified + number_of_reviews + review_scores_rating +
    room_type,
  data = regression_data
)
# Coefficient table including confidence intervals
tidy(model2, conf.int = TRUE)
## # A tibble: 9 x 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 7.12 0.0427 167. 0. 7.04 7.20
## 2 prop_type_simplified… -0.0321 0.0145 -2.22 2.63e- 2 -0.0605 -0.00378
## 3 prop_type_simplified… 0.277 0.0162 17.1 3.51e-65 0.245 0.309
## 4 prop_type_simplified… -0.0315 0.0176 -1.79 7.35e- 2 -0.0660 0.00300
## 5 prop_type_simplified… 0.530 0.0131 40.6 0. 0.504 0.555
## 6 number_of_reviews -0.00137 0.000234 -5.85 5.15e- 9 -0.00182 -0.000908
## 7 review_scores_rating 0.00480 0.000445 10.8 6.09e-27 0.00392 0.00567
## 8 room_typePrivate room -0.671 0.0109 -61.6 0. -0.692 -0.649
## 9 room_typeShared room -1.21 0.0232 -52.1 0. -1.26 -1.17
| r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual | nobs |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.284 | 0.284 | 0.668 | 924 | 0 | 8 | -18924 | 37867 | 37946 | 8312 | 18634 | 18643 |
From the above table, we know that room_type has a very significant impact on price_4_nights as adjusted R-squared for model 2 is more than 4 times the adjusted R-squared for model 1. Room type is also a categorical variable with 3 categories, and hence makes up 2 dummy variables in the regression model.
We notice that the t-stat values for other variables which were already present in model 1, have further increased in model 2 indicating that there may be some multicollinearity between the variables. To check if that’s the case, we’ll calculate VIF.
## GVIF Df GVIF^(1/(2*Df))
## prop_type_simplified 1.04 4 1.01
## number_of_reviews 1.01 1 1.01
## review_scores_rating 1.01 1 1.00
## room_type 1.04 2 1.01
# creating a huxtable that summarises the two models side by side, reporting
# the number of observations, R squared, adjusted R squared and residual SE
huxreg(
  model1, model2,
  statistics = c(
    "#observations" = "nobs",
    "R squared" = "r.squared",
    "Adj. R Squared" = "adj.r.squared",
    "Residual SE" = "sigma"
  ),
  bold_signif = 0.05,
  stars = NULL
) %>%
  set_caption("Comparison of Models 1.0")
| (1) | (2) | |
|---|---|---|
| (Intercept) | 6.909 | 7.119 |
| (0.049) | (0.043) | |
| prop_type_simplifiedCondominium | -0.059 | -0.032 |
| (0.016) | (0.014) | |
| prop_type_simplifiedHouse | 0.204 | 0.277 |
| (0.018) | (0.016) | |
| prop_type_simplifiedLoft | 0.101 | -0.032 |
| (0.020) | (0.018) | |
| prop_type_simplifiedOther | 0.455 | 0.530 |
| (0.015) | (0.013) | |
| number_of_reviews | -0.002 | -0.001 |
| (0.000) | (0.000) | |
| review_scores_rating | 0.004 | 0.005 |
| (0.001) | (0.000) | |
| room_typePrivate room | -0.671 | |
| (0.011) | ||
| room_typeShared room | -1.211 | |
| (0.023) | ||
| #observations | 18643 | 18643 |
| R squared | 0.067 | 0.284 |
| Adj. R Squared | 0.067 | 0.284 |
| Residual SE | 0.762 | 0.668 |
Previously, we plotted a correlation matrix to see which variables can be added to our regression model.
# Model 3: model 2 plus the size-related predictors - bedrooms, bathrooms,
# beds and the number of guests the property can accommodate.
model3 <- lm(
  formula = price_4_nights ~
    prop_type_simplified + number_of_reviews + review_scores_rating +
    room_type +
    bedrooms + bathrooms + beds + accommodates,
  data = regression_data
)
model3 %>% tidy(conf.int=TRUE)## # A tibble: 13 x 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 6.73 0.0371 181. 0. 6.65e+0 6.80e+0
## 2 prop_type_simplif… -0.0361 0.0124 -2.91 3.68e- 3 -6.04e-2 -1.17e-2
## 3 prop_type_simplif… 0.123 0.0140 8.78 1.77e- 18 9.57e-2 1.51e-1
## 4 prop_type_simplif… -0.0673 0.0152 -4.44 9.02e- 6 -9.71e-2 -3.76e-2
## 5 prop_type_simplif… 0.265 0.0117 22.6 1.06e-111 2.42e-1 2.87e-1
## 6 number_of_reviews -0.000362 0.000201 -1.80 7.13e- 2 -7.56e-4 3.14e-5
## 7 review_scores_rat… 0.00326 0.000385 8.46 2.79e- 17 2.50e-3 4.01e-3
## 8 room_typePrivate … -0.416 0.00999 -41.6 0. -4.35e-1 -3.96e-1
## 9 room_typeShared r… -0.919 0.0208 -44.2 0. -9.60e-1 -8.79e-1
## 10 bedrooms 0.0805 0.00723 11.1 1.21e- 28 6.63e-2 9.46e-2
## 11 bathrooms 0.0302 0.00428 7.05 1.88e- 12 2.18e-2 3.85e-2
## 12 beds -0.0308 0.00337 -9.15 6.53e- 20 -3.74e-2 -2.42e-2
## 13 accommodates 0.112 0.00306 36.6 8.29e-284 1.06e-1 1.18e-1
| r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual | nobs |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.474 | 0.474 | 0.573 | 1395 | 0 | 12 | -15992 | 32012 | 32121 | 6086 | 18559 | 18572 |
## GVIF Df GVIF^(1/(2*Df))
## prop_type_simplified 1.15 4 1.02
## number_of_reviews 1.02 1 1.01
## review_scores_rating 1.01 1 1.01
## room_type 1.26 2 1.06
## bedrooms 4.39 1 2.10
## bathrooms 1.62 1 1.27
## beds 3.12 1 1.77
## accommodates 4.42 1 2.10
In the table above, we can see that the VIFs for bedrooms, beds and accommodates are higher than those of the other variables. This is expected: the more beds and bedrooms a property has, the more guests it can accommodate, so there is some correlation between these variables. It is not a problem as such, since all of these VIFs are still less than 5.
Does price of a property vary significantly if host is a Superhost?
Superhosts are experienced hosts who are most dedicated to providing outstanding hospitality to their guests. They need to maintain certain standards in response rate, cancellation rate and overall rating to earn this badge. From that perspective, we hypothesize that other factors remaining constant, a Superhost will charge prices higher than the average host. Let’s see if that’s true.
# Model 5: the predictors of model 3 plus Superhost status
# (host_is_superhost).
# NOTE(review): the original comment described this as "model 4 + superhost
# status", but no model4 is defined in this part of the document - confirm.
model5 <- lm(
  formula = price_4_nights ~
    prop_type_simplified + number_of_reviews + review_scores_rating +
    room_type +
    bedrooms + bathrooms + beds + accommodates +
    host_is_superhost,
  data = regression_data
)
model5 %>% tidy(conf.int=TRUE)## # A tibble: 14 x 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 6.76e+0 0.0373 181. 0. 6.68 6.83
## 2 prop_type_simplifi… -3.80e-2 0.0124 -3.06 2.22e- 3 -0.0623 -0.0136
## 3 prop_type_simplifi… 1.25e-1 0.0140 8.90 6.09e- 19 0.0973 0.152
## 4 prop_type_simplifi… -7.02e-2 0.0152 -4.63 3.60e- 6 -0.0999 -0.0405
## 5 prop_type_simplifi… 2.66e-1 0.0117 22.8 3.67e-113 0.243 0.289
## 6 number_of_reviews -7.03e-4 0.000207 -3.40 6.79e- 4 -0.00111 -0.000298
## 7 review_scores_rati… 2.74e-3 0.000392 6.99 2.90e- 12 0.00197 0.00351
## 8 room_typePrivate r… -4.16e-1 0.00998 -41.7 0. -0.436 -0.397
## 9 room_typeShared ro… -9.15e-1 0.0208 -44.0 0. -0.956 -0.874
## 10 bedrooms 8.20e-2 0.00723 11.3 9.78e- 30 0.0679 0.0962
## 11 bathrooms 2.97e-2 0.00427 6.95 3.73e- 12 0.0213 0.0381
## 12 beds -3.07e-2 0.00337 -9.12 8.00e- 20 -0.0373 -0.0241
## 13 accommodates 1.12e-1 0.00306 36.6 9.70e-283 0.106 0.118
## 14 host_is_superhostT… 6.21e-2 0.00916 6.78 1.24e- 11 0.0441 0.0800
| r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual | nobs |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.476 | 0.475 | 0.572 | 1295 | 0 | 13 | -15968 | 31966 | 32083 | 6070 | 18557 | 18571 |
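Our hypothesis seems to be true;
host_is_superhost is significant as per its t-stat and p-value. Since the percentage reading implies that price_4_nights is on a log scale, the coefficient of 0.0621 means that, other factors remaining constant, one can expect the price of a Superhost’s property to be about 6% higher than that of a non-Superhost’s property (exp(0.0621) − 1 ≈ 6.4%).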
Is Location Exact?
Some hosts specify the exact location of their property; let’s see if that has any effect on the price for 4 nights.
# Model 6: model 5 plus is_location_exact (whether the host specified the
# exact location of the property).
model6 <- lm(
  formula = price_4_nights ~
    prop_type_simplified + number_of_reviews + review_scores_rating +
    room_type +
    bedrooms + bathrooms + beds + accommodates +
    host_is_superhost + is_location_exact,
  data = regression_data
)
model6 %>% tidy(conf.int=TRUE)## # A tibble: 15 x 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 6.82e+0 0.0379 180. 0. 6.74 6.89
## 2 prop_type_simplifi… -3.86e-2 0.0124 -3.12 1.83e- 3 -0.0629 -0.0143
## 3 prop_type_simplifi… 1.12e-1 0.0141 7.95 1.92e- 15 0.0842 0.139
## 4 prop_type_simplifi… -6.83e-2 0.0151 -4.51 6.37e- 6 -0.0979 -0.0386
## 5 prop_type_simplifi… 2.52e-1 0.0118 21.4 2.41e-100 0.229 0.275
## 6 number_of_reviews -8.97e-4 0.000208 -4.32 1.54e- 5 -0.00130 -0.000491
## 7 review_scores_rati… 2.70e-3 0.000391 6.90 5.24e- 12 0.00193 0.00347
## 8 room_typePrivate r… -4.24e-1 0.00999 -42.4 0. -0.444 -0.404
## 9 room_typeShared ro… -9.19e-1 0.0207 -44.3 0. -0.960 -0.878
## 10 bedrooms 8.04e-2 0.00722 11.1 1.03e- 28 0.0662 0.0945
## 11 bathrooms 2.85e-2 0.00427 6.68 2.48e- 11 0.0201 0.0369
## 12 beds -3.00e-2 0.00336 -8.94 4.08e- 19 -0.0366 -0.0235
## 13 accommodates 1.11e-1 0.00305 36.3 5.98e-279 0.105 0.117
## 14 host_is_superhostT… 6.74e-2 0.00915 7.37 1.83e- 13 0.0495 0.0854
## 15 is_location_exactT… -7.98e-2 0.00872 -9.15 6.30e- 20 -0.0969 -0.0627
| r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual | nobs |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.478 | 0.478 | 0.571 | 1213 | 0 | 14 | -15926 | 31884 | 32010 | 6043 | 18556 | 18571 |
Well, the variable is_location_exact seems to be significant as per its t-stat and p-value; however, the negative coefficient is surprising. Maybe the effect has less to do with whether the location specified is exact than with what the location actually is!
For this purpose, let us include neighbourhood location into our regression model. To make things simple, we created a new variable called neighbourhood_simplified which groups different listings into broader categories or rings.
# Model 7: model 6 plus the neighbourhood location, using the grouped
# neighbourhood_simplified variable.
model7 <- lm(
  formula = price_4_nights ~
    prop_type_simplified + number_of_reviews + review_scores_rating +
    room_type +
    bedrooms + bathrooms + beds + accommodates +
    host_is_superhost + is_location_exact + neighbourhood_simplified,
  data = regression_data
)
model7 %>% tidy(conf.int=TRUE)## # A tibble: 19 x 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 6.96 0.0386 181. 0. 6.89 7.04
## 2 prop_type_simplifi… -0.0335 0.0122 -2.74 6.18e- 3 -0.0574 -0.00951
## 3 prop_type_simplifi… 0.0995 0.0140 7.13 1.08e- 12 0.0721 0.127
## 4 prop_type_simplifi… -0.0268 0.0154 -1.74 8.15e- 2 -0.0570 0.00335
## 5 prop_type_simplifi… 0.264 0.0120 21.9 4.34e-105 0.240 0.288
## 6 number_of_reviews -0.00171 0.000208 -8.20 2.54e- 16 -0.00211 -0.00130
## 7 review_scores_rati… 0.00310 0.000387 8.03 1.05e- 15 0.00235 0.00386
## 8 room_typePrivate r… -0.422 0.00990 -42.6 0. -0.441 -0.402
## 9 room_typeShared ro… -0.934 0.0206 -45.3 0. -0.975 -0.894
## 10 bedrooms 0.0971 0.00716 13.6 1.16e- 41 0.0830 0.111
## 11 bathrooms 0.0355 0.00422 8.41 4.52e- 17 0.0272 0.0438
## 12 beds -0.0307 0.00331 -9.28 1.96e- 20 -0.0372 -0.0242
## 13 accommodates 0.106 0.00302 35.1 1.09e-261 0.100 0.112
## 14 host_is_superhostT… 0.0633 0.00903 7.01 2.50e- 12 0.0456 0.0810
## 15 is_location_exactT… -0.0792 0.00863 -9.17 5.04e- 20 -0.0961 -0.0623
## 16 neighbourhood_simp… -0.202 0.0146 -13.9 2.06e- 43 -0.231 -0.174
## 17 neighbourhood_simp… -0.184 0.0131 -14.1 1.14e- 44 -0.210 -0.158
## 18 neighbourhood_simp… -0.206 0.0374 -5.51 3.56e- 8 -0.280 -0.133
## 19 neighbourhood_simp… -0.297 0.0130 -22.8 2.29e-113 -0.322 -0.271
| r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual | nobs |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.492 | 0.492 | 0.563 | 999 | 0 | 18 | -15668 | 31377 | 31533 | 5877 | 18552 | 18571 |
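neighbourhood_simplified is a categorical variable with 5 categories - Ring 2, Ring 3, Ring 4, Ring 5 and Ring 6 - so it enters the model as 4 dummy variables, with Ring 2 as the baseline. All four coefficients are negative and highly significant: compared with an otherwise similar listing in Ring 2, the dependent variable is lower by about 0.18 to 0.21 for listings in Rings 3, 4 and 5, and by about 0.30 for listings in Ring 6. In other words, listings in Ring 2 are the most expensive and listings in Ring 6 the cheapest. Note, however, that the coefficient of is_location_exact is practically unchanged (-0.079), so the neighbourhood does not explain its negative sign.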
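With inclusion of these location variables, our adjusted R-squared has increased to 0.492. Let’s continue to improve our model further. From the perspective of a host who is setting prices in accordance with the time, money and effort he spends in managing the property, and from the perspective of a traveler who is booking the Airbnb and paying that price, some other variables worth considering are the cancellation policy, the cleanliness score, whether the listing is instantly bookable, the security deposit, amenities such as wifi and breakfast, the number of reviews per month and the number of listings the host has. We add each of them to model 7 in turn.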
# Model 8: model 7 plus cancellation_policy.
model8 <- lm(
  formula = price_4_nights ~
    prop_type_simplified + number_of_reviews + review_scores_rating +
    room_type +
    bedrooms + bathrooms + beds + accommodates +
    host_is_superhost + is_location_exact + neighbourhood_simplified +
    cancellation_policy,
  data = regression_data
)
model8 %>% tidy(conf.int=TRUE)## # A tibble: 21 x 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 6.93 0.0387 179. 0. 6.86 7.01
## 2 prop_type_simplifi… -0.0366 0.0122 -3.00 2.74e- 3 -0.0605 -0.0126
## 3 prop_type_simplifi… 0.0985 0.0139 7.06 1.69e- 12 0.0712 0.126
## 4 prop_type_simplifi… -0.0289 0.0154 -1.88 6.07e- 2 -0.0590 0.00130
## 5 prop_type_simplifi… 0.265 0.0120 22.0 4.66e-106 0.241 0.288
## 6 number_of_reviews -0.00186 0.000209 -8.89 6.96e- 19 -0.00227 -0.00145
## 7 review_scores_rati… 0.00307 0.000386 7.95 2.04e- 15 0.00231 0.00383
## 8 room_typePrivate r… -0.420 0.00989 -42.5 0. -0.440 -0.401
## 9 room_typeShared ro… -0.933 0.0206 -45.3 0. -0.973 -0.892
## 10 bedrooms 0.0976 0.00715 13.6 3.72e- 42 0.0836 0.112
## # … with 11 more rows
| r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual | nobs |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.494 | 0.493 | 0.562 | 904 | 0 | 20 | -15644 | 31332 | 31505 | 5862 | 18550 | 18571 |
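Cancellation policy - significant: both the moderate and the strict policy are associated with higher prices (coefficients of 0.055 and 0.066), and AIC and BIC are lower than for model 7 (31332 and 31505 vs 31377 and 31533).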
# Model 9: model 7 plus review_scores_cleanliness.
model9 <- lm(
  formula = price_4_nights ~
    prop_type_simplified + number_of_reviews + review_scores_rating +
    room_type +
    bedrooms + bathrooms + beds + accommodates +
    host_is_superhost + is_location_exact + neighbourhood_simplified +
    review_scores_cleanliness,
  data = regression_data
)
model9 %>% tidy(conf.int=TRUE)## # A tibble: 20 x 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 6.91 0.0410 168. 0. 6.83e+0 6.99
## 2 prop_type_simplif… -0.0335 0.0122 -2.75 6.04e- 3 -5.75e-2 -0.00960
## 3 prop_type_simplif… 0.0989 0.0140 7.09 1.42e- 12 7.16e-2 0.126
## 4 prop_type_simplif… -0.0274 0.0154 -1.78 7.55e- 2 -5.76e-2 0.00281
## 5 prop_type_simplif… 0.263 0.0120 21.8 4.17e-104 2.39e-1 0.286
## 6 number_of_reviews -0.00172 0.000208 -8.26 1.53e- 16 -2.13e-3 -0.00131
## 7 review_scores_rat… 0.00113 0.000622 1.81 6.98e- 2 -9.11e-5 0.00235
## 8 room_typePrivate … -0.422 0.00989 -42.7 0. -4.41e-1 -0.403
## 9 room_typeShared r… -0.931 0.0206 -45.1 0. -9.71e-1 -0.890
## 10 bedrooms 0.0971 0.00716 13.6 1.09e- 41 8.30e-2 0.111
## 11 bathrooms 0.0355 0.00422 8.42 4.05e- 17 2.73e-2 0.0438
## 12 beds -0.0307 0.00331 -9.26 2.36e- 20 -3.71e-2 -0.0242
## 13 accommodates 0.106 0.00302 35.2 2.39e-262 1.00e-1 0.112
## 14 host_is_superhost… 0.0605 0.00905 6.68 2.46e- 11 4.27e-2 0.0782
## 15 is_location_exact… -0.0796 0.00863 -9.22 3.26e- 20 -9.65e-2 -0.0627
## 16 neighbourhood_sim… -0.203 0.0146 -13.9 8.59e- 44 -2.32e-1 -0.175
## 17 neighbourhood_sim… -0.185 0.0131 -14.1 5.94e- 45 -2.10e-1 -0.159
## 18 neighbourhood_sim… -0.207 0.0374 -5.53 3.28e- 8 -2.80e-1 -0.134
## 19 neighbourhood_sim… -0.299 0.0130 -22.9 6.13e-115 -3.24e-1 -0.273
## 20 review_scores_cle… 0.0256 0.00635 4.03 5.55e- 5 1.32e-2 0.0381
| r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual | nobs |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.493 | 0.492 | 0.563 | 948 | 0 | 19 | -15658 | 31357 | 31522 | 5871 | 18548 | 18568 |
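Cleanliness score - significant, but AIC and BIC are higher (31357 and 31522) compared to when we use cancellation policy (31332 and 31505). Also, review_scores_rating is no longer significant at the 5% level (p = 0.07) once the cleanliness score is included.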
# Model 10: model 7 plus instant_bookable.
model10 <- lm(
  formula = price_4_nights ~
    prop_type_simplified + number_of_reviews + review_scores_rating +
    room_type +
    bedrooms + bathrooms + beds + accommodates +
    host_is_superhost + is_location_exact + neighbourhood_simplified +
    instant_bookable,
  data = regression_data
)
model10 %>% tidy(conf.int=TRUE)## # A tibble: 20 x 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 6.96 0.0389 179. 0. 6.88 7.04
## 2 prop_type_simplifi… -0.0335 0.0122 -2.74 6.14e- 3 -0.0574 -0.00954
## 3 prop_type_simplifi… 0.0994 0.0140 7.12 1.11e- 12 0.0721 0.127
## 4 prop_type_simplifi… -0.0269 0.0154 -1.74 8.12e- 2 -0.0571 0.00333
## 5 prop_type_simplifi… 0.264 0.0121 21.9 6.77e-105 0.240 0.287
## 6 number_of_reviews -0.00170 0.000208 -8.18 3.04e- 16 -0.00211 -0.00130
## 7 review_scores_rati… 0.00311 0.000387 8.03 1.04e- 15 0.00235 0.00386
## 8 room_typePrivate r… -0.422 0.00991 -42.6 0. -0.441 -0.402
## 9 room_typeShared ro… -0.934 0.0207 -45.2 0. -0.974 -0.893
## 10 bedrooms 0.0971 0.00716 13.6 1.15e- 41 0.0830 0.111
## 11 bathrooms 0.0355 0.00422 8.40 4.64e- 17 0.0272 0.0437
## 12 beds -0.0307 0.00331 -9.28 1.93e- 20 -0.0372 -0.0242
## 13 accommodates 0.106 0.00302 35.1 1.26e-261 0.100 0.112
## 14 host_is_superhostT… 0.0632 0.00904 6.99 2.91e- 12 0.0454 0.0809
## 15 is_location_exactT… -0.0794 0.00865 -9.17 5.14e- 20 -0.0963 -0.0624
## 16 neighbourhood_simp… -0.203 0.0146 -13.9 2.08e- 43 -0.231 -0.174
## 17 neighbourhood_simp… -0.184 0.0131 -14.1 1.11e- 44 -0.210 -0.159
## 18 neighbourhood_simp… -0.206 0.0374 -5.51 3.55e- 8 -0.280 -0.133
## 19 neighbourhood_simp… -0.297 0.0130 -22.7 5.39e-113 -0.322 -0.271
## 20 instant_bookableTR… 0.00251 0.00891 0.282 7.78e- 1 -0.0150 0.0200
| r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual | nobs |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.492 | 0.492 | 0.563 | 947 | 0 | 19 | -15668 | 31379 | 31543 | 5877 | 18551 | 18571 |
Instant bookable - not significant: the t-stat is low (0.28, p-value 0.78), and AIC and BIC are slightly higher than for model 7.
# Model 11: model 7 plus security_deposit, entered untransformed (in levels).
model11 <- lm(
  formula = price_4_nights ~
    prop_type_simplified + number_of_reviews + review_scores_rating +
    room_type +
    bedrooms + bathrooms + beds + accommodates +
    host_is_superhost + is_location_exact + neighbourhood_simplified +
    security_deposit,
  data = regression_data
)
model11 %>% tidy(conf.int=TRUE)## # A tibble: 20 x 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 6.96e+0 0.0385 181. 0. 6.89e+0 7.04e+0
## 2 prop_type_simpli… -3.45e-2 0.0122 -2.83 4.63e- 3 -5.84e-2 -1.06e-2
## 3 prop_type_simpli… 9.95e-2 0.0139 7.14 9.79e- 13 7.22e-2 1.27e-1
## 4 prop_type_simpli… -2.60e-2 0.0154 -1.69 9.03e- 2 -5.62e-2 4.09e-3
## 5 prop_type_simpli… 2.63e-1 0.0120 21.9 7.48e-105 2.40e-1 2.87e-1
## 6 number_of_reviews -1.81e-3 0.000208 -8.72 3.07e- 18 -2.22e-3 -1.41e-3
## 7 review_scores_ra… 3.06e-3 0.000386 7.92 2.44e- 15 2.30e-3 3.82e-3
## 8 room_typePrivate… -4.19e-1 0.00988 -42.4 0. -4.38e-1 -3.99e-1
## 9 room_typeShared … -9.29e-1 0.0206 -45.2 0. -9.70e-1 -8.89e-1
## 10 bedrooms 9.68e-2 0.00715 13.5 1.33e- 41 8.28e-2 1.11e-1
## 11 bathrooms 3.54e-2 0.00421 8.41 4.48e- 17 2.72e-2 4.37e-2
## 12 beds -3.03e-2 0.00331 -9.16 5.57e- 20 -3.68e-2 -2.38e-2
## 13 accommodates 1.06e-1 0.00302 35.0 2.02e-260 9.98e-2 1.12e-1
## 14 host_is_superhos… 6.20e-2 0.00901 6.88 6.22e- 12 4.43e-2 7.97e-2
## 15 is_location_exac… -7.67e-2 0.00862 -8.89 6.65e- 19 -9.36e-2 -5.98e-2
## 16 neighbourhood_si… -2.03e-1 0.0146 -13.9 5.98e- 44 -2.32e-1 -1.75e-1
## 17 neighbourhood_si… -1.86e-1 0.0131 -14.2 9.84e- 46 -2.12e-1 -1.60e-1
## 18 neighbourhood_si… -2.08e-1 0.0374 -5.56 2.74e- 8 -2.81e-1 -1.34e-1
## 19 neighbourhood_si… -2.95e-1 0.0130 -22.7 1.43e-112 -3.20e-1 -2.69e-1
## 20 security_deposit 2.39e-5 0.00000273 8.78 1.80e- 18 1.86e-5 2.93e-5
| r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual | nobs |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.494 | 0.494 | 0.562 | 955 | 0 | 19 | -15630 | 31302 | 31466 | 5853 | 18551 | 18571 |
# Model 12: model 7 plus the log of security_deposit, used instead of the raw
# value because security_deposit is a highly skewed variable.
# NOTE(review): the 0.001 offset is presumably there so that deposits of zero
# still have a finite log - confirm.
model12 <- lm(
  formula = price_4_nights ~
    prop_type_simplified + number_of_reviews + review_scores_rating +
    room_type +
    bedrooms + bathrooms + beds + accommodates +
    host_is_superhost + is_location_exact + neighbourhood_simplified +
    log(security_deposit + 0.001),
  data = regression_data
)
model12 %>% tidy(conf.int=TRUE)## # A tibble: 20 x 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 7.00 0.0386 182. 0. 6.93 7.08
## 2 prop_type_simplifi… -0.0362 0.0122 -2.97 2.94e- 3 -0.0601 -0.0124
## 3 prop_type_simplifi… 0.103 0.0139 7.43 1.10e- 13 0.0762 0.131
## 4 prop_type_simplifi… -0.0295 0.0154 -1.92 5.48e- 2 -0.0596 0.000608
## 5 prop_type_simplifi… 0.268 0.0120 22.3 9.48e-109 0.244 0.291
## 6 number_of_reviews -0.00190 0.000208 -9.11 8.91e- 20 -0.00230 -0.00149
## 7 review_scores_rati… 0.00291 0.000386 7.53 5.22e- 14 0.00215 0.00366
## 8 room_typePrivate r… -0.412 0.00989 -41.6 0. -0.431 -0.393
## 9 room_typeShared ro… -0.914 0.0206 -44.3 0. -0.954 -0.874
## 10 bedrooms 0.0960 0.00714 13.5 4.23e- 41 0.0820 0.110
## 11 bathrooms 0.0352 0.00421 8.36 6.69e- 17 0.0269 0.0434
## 12 beds -0.0301 0.00330 -9.12 8.45e- 20 -0.0366 -0.0236
## 13 accommodates 0.106 0.00301 35.0 1.54e-260 0.0997 0.112
## 14 host_is_superhostT… 0.0569 0.00901 6.31 2.81e- 10 0.0392 0.0746
## 15 is_location_exactT… -0.0729 0.00862 -8.46 2.87e- 17 -0.0898 -0.0560
## 16 neighbourhood_simp… -0.201 0.0146 -13.8 4.78e- 43 -0.229 -0.172
## 17 neighbourhood_simp… -0.187 0.0131 -14.3 3.22e- 46 -0.212 -0.161
## 18 neighbourhood_simp… -0.211 0.0373 -5.67 1.49e- 8 -0.284 -0.138
## 19 neighbourhood_simp… -0.290 0.0130 -22.3 6.19e-109 -0.315 -0.264
## 20 log(security_depos… 0.00831 0.000705 11.8 5.91e- 32 0.00693 0.00969
| r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual | nobs |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.496 | 0.496 | 0.561 | 961 | 0 | 19 | -15599 | 31240 | 31404 | 5834 | 18551 | 18571 |
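The log of the security deposit is better than the raw value: model 12 has lower AIC and BIC (31240 and 31404 vs 31302 and 31466 for model 11) and a higher adjusted R-squared (0.496 vs 0.494).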
# Amenities: three models are tried - wifi only, breakfast only, and both
# wifi and breakfast.
# Model 13: model 7 plus wifi.
model13 <- lm(
  formula = price_4_nights ~
    prop_type_simplified + number_of_reviews + review_scores_rating +
    room_type +
    bedrooms + bathrooms + beds + accommodates +
    host_is_superhost + is_location_exact + neighbourhood_simplified +
    wifi,
  data = regression_data
)
model13 %>% tidy(conf.int=TRUE)## # A tibble: 20 x 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 6.83 0.0462 148. 0. 6.74 6.92
## 2 prop_type_simplifi… -0.0345 0.0122 -2.83 4.69e- 3 -0.0585 -0.0106
## 3 prop_type_simplifi… 0.0994 0.0140 7.12 1.10e- 12 0.0720 0.127
## 4 prop_type_simplifi… -0.0284 0.0154 -1.85 6.50e- 2 -0.0586 0.00177
## 5 prop_type_simplifi… 0.264 0.0120 21.9 3.15e-105 0.240 0.288
## 6 number_of_reviews -0.00175 0.000208 -8.40 4.88e- 17 -0.00216 -0.00134
## 7 review_scores_rati… 0.00303 0.000387 7.83 5.16e- 15 0.00227 0.00379
## 8 room_typePrivate r… -0.422 0.00989 -42.7 0. -0.442 -0.403
## 9 room_typeShared ro… -0.933 0.0206 -45.3 0. -0.974 -0.893
## 10 bedrooms 0.0974 0.00716 13.6 5.09e- 42 0.0834 0.111
## 11 bathrooms 0.0354 0.00422 8.39 5.21e- 17 0.0271 0.0437
## 12 beds -0.0308 0.00331 -9.31 1.40e- 20 -0.0373 -0.0243
## 13 accommodates 0.106 0.00302 35.0 1.98e-260 0.100 0.112
## 14 host_is_superhostT… 0.0621 0.00903 6.87 6.45e- 12 0.0444 0.0797
## 15 is_location_exactT… -0.0788 0.00863 -9.13 7.57e- 20 -0.0957 -0.0619
## 16 neighbourhood_simp… -0.201 0.0146 -13.7 9.99e- 43 -0.229 -0.172
## 17 neighbourhood_simp… -0.184 0.0131 -14.0 1.54e- 44 -0.209 -0.158
## 18 neighbourhood_simp… -0.207 0.0374 -5.52 3.42e- 8 -0.280 -0.133
## 19 neighbourhood_simp… -0.295 0.0130 -22.7 3.50e-112 -0.320 -0.269
## 20 wifiTRUE 0.145 0.0277 5.23 1.74e- 7 0.0905 0.199
| r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual | nobs |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.493 | 0.492 | 0.562 | 949 | 0 | 19 | -15655 | 31351 | 31516 | 5869 | 18551 | 18571 |
# Model 14: model 7 plus breakfast.
model14 <- lm(
  formula = price_4_nights ~
    prop_type_simplified + number_of_reviews + review_scores_rating +
    room_type +
    bedrooms + bathrooms + beds + accommodates +
    host_is_superhost + is_location_exact + neighbourhood_simplified +
    breakfast,
  data = regression_data
)
model14 %>% tidy(conf.int=TRUE)## # A tibble: 20 x 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 6.98 0.0382 183. 0. 6.90 7.05
## 2 prop_type_simplifi… -0.0276 0.0121 -2.28 2.26e- 2 -0.0513 -0.00387
## 3 prop_type_simplifi… 0.0952 0.0138 6.88 5.99e- 12 0.0681 0.122
## 4 prop_type_simplifi… -0.0175 0.0153 -1.15 2.52e- 1 -0.0474 0.0124
## 5 prop_type_simplifi… 0.226 0.0121 18.7 2.73e- 77 0.202 0.250
## 6 number_of_reviews -0.00176 0.000206 -8.55 1.34e- 17 -0.00217 -0.00136
## 7 review_scores_rati… 0.00298 0.000383 7.77 8.55e- 15 0.00222 0.00373
## 8 room_typePrivate r… -0.449 0.00991 -45.3 0. -0.469 -0.430
## 9 room_typeShared ro… -0.958 0.0205 -46.8 0. -0.998 -0.918
## 10 bedrooms 0.0955 0.00709 13.5 3.84e- 41 0.0816 0.109
## 11 bathrooms 0.0320 0.00419 7.63 2.37e- 14 0.0238 0.0402
## 12 beds -0.0308 0.00328 -9.39 6.60e- 21 -0.0373 -0.0244
## 13 accommodates 0.105 0.00300 35.1 2.12e-261 0.0993 0.111
## 14 host_is_superhostT… 0.0634 0.00895 7.09 1.42e- 12 0.0459 0.0809
## 15 is_location_exactT… -0.0708 0.00857 -8.27 1.43e- 16 -0.0876 -0.0540
## 16 neighbourhood_simp… -0.201 0.0145 -13.9 1.58e- 43 -0.229 -0.172
## 17 neighbourhood_simp… -0.184 0.0130 -14.2 2.51e- 45 -0.209 -0.158
## 18 neighbourhood_simp… -0.206 0.0371 -5.57 2.65e- 8 -0.279 -0.134
## 19 neighbourhood_simp… -0.319 0.0129 -24.6 8.48e-132 -0.344 -0.293
## 20 breakfastTRUE 0.267 0.0142 18.9 1.40e- 78 0.239 0.295
| r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual | nobs |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.502 | 0.501 | 0.558 | 983 | 0 | 19 | -15492 | 31026 | 31190 | 5767 | 18551 | 18571 |
# Model 15: model 7 plus both amenities, wifi and breakfast.
model15 <- lm(
  formula = price_4_nights ~
    prop_type_simplified + number_of_reviews + review_scores_rating +
    room_type +
    bedrooms + bathrooms + beds + accommodates +
    host_is_superhost + is_location_exact + neighbourhood_simplified +
    wifi + breakfast,
  data = regression_data
)
model15 %>% tidy(conf.int=TRUE)## # A tibble: 21 x 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 6.86 0.0458 150. 0. 6.77 6.95
## 2 prop_type_simplifie… -0.0286 0.0121 -2.36 1.80e- 2 -0.0523 -0.00490
## 3 prop_type_simplifie… 0.0952 0.0138 6.88 6.04e-12 0.0681 0.122
## 4 prop_type_simplifie… -0.0190 0.0153 -1.24 2.14e- 1 -0.0489 0.0109
## 5 prop_type_simplifie… 0.226 0.0121 18.7 1.52e-77 0.203 0.250
## 6 number_of_reviews -0.00180 0.000206 -8.73 2.88e-18 -0.00220 -0.00140
## 7 review_scores_rating 0.00291 0.000383 7.59 3.44e-14 0.00216 0.00366
## 8 room_typePrivate ro… -0.449 0.00990 -45.4 0. -0.469 -0.430
## 9 room_typeShared room -0.958 0.0205 -46.8 0. -0.998 -0.918
## 10 bedrooms 0.0959 0.00709 13.5 1.80e-41 0.0820 0.110
## # … with 11 more rows
| r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual | nobs |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.502 | 0.502 | 0.557 | 936 | 0 | 20 | -15481 | 31005 | 31178 | 5760 | 18550 | 18571 |
# Checking a further review-related variable.
# Model 16: model 7 plus reviews_per_month.
model16 <- lm(
  formula = price_4_nights ~
    prop_type_simplified + number_of_reviews + review_scores_rating +
    room_type +
    bedrooms + bathrooms + beds + accommodates +
    host_is_superhost + is_location_exact + neighbourhood_simplified +
    reviews_per_month,
  data = regression_data
)
model16 %>% tidy(conf.int=TRUE)## # A tibble: 20 x 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 6.96e+0 0.0385 181. 0. 6.88 7.03
## 2 prop_type_simplifi… -3.18e-2 0.0122 -2.60 9.22e- 3 -0.0558 -0.00787
## 3 prop_type_simplifi… 1.01e-1 0.0140 7.21 5.66e- 13 0.0733 0.128
## 4 prop_type_simplifi… -2.23e-2 0.0154 -1.45 1.48e- 1 -0.0525 0.00794
## 5 prop_type_simplifi… 2.64e-1 0.0120 21.9 2.58e-105 0.240 0.288
## 6 number_of_reviews -3.83e-4 0.000344 -1.11 2.65e- 1 -0.00106 0.000291
## 7 review_scores_rati… 3.21e-3 0.000387 8.28 1.28e- 16 0.00245 0.00396
## 8 room_typePrivate r… -4.25e-1 0.00991 -42.9 0. -0.444 -0.406
## 9 room_typeShared ro… -9.43e-1 0.0207 -45.6 0. -0.983 -0.902
## 10 bedrooms 9.62e-2 0.00716 13.4 5.75e- 41 0.0822 0.110
## 11 bathrooms 3.63e-2 0.00422 8.59 9.21e- 18 0.0280 0.0445
## 12 beds -3.08e-2 0.00331 -9.30 1.53e- 20 -0.0373 -0.0243
## 13 accommodates 1.06e-1 0.00302 35.1 1.59e-261 0.100 0.112
## 14 host_is_superhostT… 7.42e-2 0.00930 7.98 1.57e- 15 0.0560 0.0925
## 15 is_location_exactT… -7.28e-2 0.00873 -8.33 8.43e- 17 -0.0899 -0.0556
## 16 neighbourhood_simp… -2.01e-1 0.0146 -13.8 5.36e- 43 -0.230 -0.173
## 17 neighbourhood_simp… -1.85e-1 0.0131 -14.1 4.88e- 45 -0.211 -0.159
## 18 neighbourhood_simp… -2.08e-1 0.0374 -5.57 2.58e- 8 -0.282 -0.135
## 19 neighbourhood_simp… -2.99e-1 0.0130 -23.0 5.67e-115 -0.324 -0.273
## 20 reviews_per_month -4.25e-2 0.00877 -4.84 1.28e- 6 -0.0597 -0.0253
| r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual | nobs |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.493 | 0.492 | 0.563 | 949 | 0 | 19 | -15657 | 31355 | 31520 | 5870 | 18551 | 18571 |
# Model 17: model 7 plus calculated_host_listings_count.
model17 <- lm(
  formula = price_4_nights ~
    prop_type_simplified + number_of_reviews + review_scores_rating +
    room_type +
    bedrooms + bathrooms + beds + accommodates +
    host_is_superhost + is_location_exact + neighbourhood_simplified +
    calculated_host_listings_count,
  data = regression_data
)
model17 %>% tidy(conf.int=TRUE)## # A tibble: 20 x 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 6.93 0.0396 175. 0. 6.85e+0 7.01
## 2 prop_type_simplifi… -0.0308 0.0122 -2.52 1.18e- 2 -5.48e-2 -0.00682
## 3 prop_type_simplifi… 0.103 0.0140 7.36 1.97e- 13 7.55e-2 0.130
## 4 prop_type_simplifi… -0.0279 0.0154 -1.81 7.00e- 2 -5.81e-2 0.00229
## 5 prop_type_simplifi… 0.266 0.0121 22.1 9.84e-107 2.43e-1 0.290
## 6 number_of_reviews -0.00164 0.000209 -7.86 4.11e- 15 -2.05e-3 -0.00123
## 7 review_scores_rati… 0.00327 0.000389 8.40 4.69e- 17 2.51e-3 0.00403
## 8 room_typePrivate r… -0.419 0.00991 -42.3 0. -4.39e-1 -0.400
## 9 room_typeShared ro… -0.931 0.0206 -45.2 0. -9.72e-1 -0.891
## 10 bedrooms 0.0976 0.00716 13.6 3.97e- 42 8.36e-2 0.112
## 11 bathrooms 0.0360 0.00422 8.53 1.53e- 17 2.78e-2 0.0443
## 12 beds -0.0305 0.00331 -9.20 3.93e- 20 -3.70e-2 -0.0240
## 13 accommodates 0.106 0.00303 34.9 3.36e-258 9.97e-2 0.112
## 14 host_is_superhostT… 0.0641 0.00903 7.09 1.35e- 12 4.64e-2 0.0818
## 15 is_location_exactT… -0.0815 0.00865 -9.42 5.22e- 21 -9.85e-2 -0.0645
## 16 neighbourhood_simp… -0.199 0.0146 -13.6 7.72e- 42 -2.28e-1 -0.170
## 17 neighbourhood_simp… -0.180 0.0131 -13.7 1.47e- 42 -2.06e-1 -0.154
## 18 neighbourhood_simp… -0.205 0.0374 -5.47 4.62e- 8 -2.78e-1 -0.131
## 19 neighbourhood_simp… -0.292 0.0131 -22.3 3.36e-109 -3.18e-1 -0.266
## 20 calculated_host_li… 0.00134 0.000365 3.66 2.53e- 4 6.20e-4 0.00205
| r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual | nobs |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.493 | 0.492 | 0.563 | 948 | 0 | 19 | -15662 | 31365 | 31530 | 5873 | 18551 | 18571 |
# summary table to compare last few models
# Columns (1)-(8) are model8 to model15, in that order; model16 and model17
# are not part of this comparison.
# `statistics` is a named vector: the names are the row labels shown in the
# table, the values are the model summary statistics to report.
huxreg(model8, model9, model10, model11, model12, model13, model14, model15,
statistics = c('#observations' = 'nobs',
'R squared' = 'r.squared',
'Adj. R Squared' = 'adj.r.squared',
'Residual SE' = 'sigma'),
bold_signif = 0.05, # bold the coefficients whose p-value is below 0.05
stars = NULL # do not print significance stars
) %>%
set_caption('Comparison of Models 3.0')| (1) | (2) | (3) | (4) | (5) | (6) | (7) | (8) | |
|---|---|---|---|---|---|---|---|---|
| (Intercept) | 6.935 | 6.908 | 6.961 | 6.961 | 7.005 | 6.829 | 6.977 | 6.857 |
| (0.039) | (0.041) | (0.039) | (0.038) | (0.039) | (0.046) | (0.038) | (0.046) | |
| prop_type_simplifiedCondominium | -0.037 | -0.034 | -0.033 | -0.035 | -0.036 | -0.035 | -0.028 | -0.029 |
| (0.012) | (0.012) | (0.012) | (0.012) | (0.012) | (0.012) | (0.012) | (0.012) | |
| prop_type_simplifiedHouse | 0.098 | 0.099 | 0.099 | 0.099 | 0.103 | 0.099 | 0.095 | 0.095 |
| (0.014) | (0.014) | (0.014) | (0.014) | (0.014) | (0.014) | (0.014) | (0.014) | |
| prop_type_simplifiedLoft | -0.029 | -0.027 | -0.027 | -0.026 | -0.029 | -0.028 | -0.018 | -0.019 |
| (0.015) | (0.015) | (0.015) | (0.015) | (0.015) | (0.015) | (0.015) | (0.015) | |
| prop_type_simplifiedOther | 0.265 | 0.263 | 0.264 | 0.263 | 0.268 | 0.264 | 0.226 | 0.226 |
| (0.012) | (0.012) | (0.012) | (0.012) | (0.012) | (0.012) | (0.012) | (0.012) | |
| number_of_reviews | -0.002 | -0.002 | -0.002 | -0.002 | -0.002 | -0.002 | -0.002 | -0.002 |
| (0.000) | (0.000) | (0.000) | (0.000) | (0.000) | (0.000) | (0.000) | (0.000) | |
| review_scores_rating | 0.003 | 0.001 | 0.003 | 0.003 | 0.003 | 0.003 | 0.003 | 0.003 |
| (0.000) | (0.001) | (0.000) | (0.000) | (0.000) | (0.000) | (0.000) | (0.000) | |
| room_typePrivate room | -0.420 | -0.422 | -0.422 | -0.419 | -0.412 | -0.422 | -0.449 | -0.449 |
| (0.010) | (0.010) | (0.010) | (0.010) | (0.010) | (0.010) | (0.010) | (0.010) | |
| room_typeShared room | -0.933 | -0.931 | -0.934 | -0.929 | -0.914 | -0.933 | -0.958 | -0.958 |
| (0.021) | (0.021) | (0.021) | (0.021) | (0.021) | (0.021) | (0.020) | (0.020) | |
| bedrooms | 0.098 | 0.097 | 0.097 | 0.097 | 0.096 | 0.097 | 0.096 | 0.096 |
| (0.007) | (0.007) | (0.007) | (0.007) | (0.007) | (0.007) | (0.007) | (0.007) | |
| bathrooms | 0.035 | 0.036 | 0.035 | 0.035 | 0.035 | 0.035 | 0.032 | 0.032 |
| (0.004) | (0.004) | (0.004) | (0.004) | (0.004) | (0.004) | (0.004) | (0.004) | |
| beds | -0.030 | -0.031 | -0.031 | -0.030 | -0.030 | -0.031 | -0.031 | -0.031 |
| (0.003) | (0.003) | (0.003) | (0.003) | (0.003) | (0.003) | (0.003) | (0.003) | |
| accommodates | 0.105 | 0.106 | 0.106 | 0.106 | 0.106 | 0.106 | 0.105 | 0.105 |
| (0.003) | (0.003) | (0.003) | (0.003) | (0.003) | (0.003) | (0.003) | (0.003) | |
| host_is_superhostTRUE | 0.055 | 0.060 | 0.063 | 0.062 | 0.057 | 0.062 | 0.063 | 0.062 |
| (0.009) | (0.009) | (0.009) | (0.009) | (0.009) | (0.009) | (0.009) | (0.009) | |
| is_location_exactTRUE | -0.077 | -0.080 | -0.079 | -0.077 | -0.073 | -0.079 | -0.071 | -0.070 |
| (0.009) | (0.009) | (0.009) | (0.009) | (0.009) | (0.009) | (0.009) | (0.009) | |
| neighbourhood_simplifiedRing 3 | -0.199 | -0.203 | -0.203 | -0.203 | -0.201 | -0.201 | -0.201 | -0.199 |
| (0.015) | (0.015) | (0.015) | (0.015) | (0.015) | (0.015) | (0.014) | (0.014) | |
| neighbourhood_simplifiedRing 4 | -0.182 | -0.185 | -0.184 | -0.186 | -0.187 | -0.184 | -0.184 | -0.183 |
| (0.013) | (0.013) | (0.013) | (0.013) | (0.013) | (0.013) | (0.013) | (0.013) | |
| neighbourhood_simplifiedRing 5 | -0.204 | -0.207 | -0.206 | -0.208 | -0.211 | -0.207 | -0.206 | -0.206 |
| (0.037) | (0.037) | (0.037) | (0.037) | (0.037) | (0.037) | (0.037) | (0.037) | |
| neighbourhood_simplifiedRing 6 | -0.288 | -0.299 | -0.297 | -0.295 | -0.290 | -0.295 | -0.319 | -0.317 |
| (0.013) | (0.013) | (0.013) | (0.013) | (0.013) | (0.013) | (0.013) | (0.013) | |
| cancellation_policymoderate | 0.055 | |||||||
| (0.010) | ||||||||
| cancellation_policystrict_14_with_grace_period | 0.066 | |||||||
| (0.011) | ||||||||
| review_scores_cleanliness | 0.026 | |||||||
| (0.006) | ||||||||
| instant_bookableTRUE | 0.003 | |||||||
| (0.009) | ||||||||
| security_deposit | 0.000 | |||||||
| (0.000) | ||||||||
| log(security_deposit + 0.001) | 0.008 | |||||||
| (0.001) | ||||||||
| wifiTRUE | 0.145 | 0.131 | ||||||
| (0.028) | (0.027) | |||||||
| breakfastTRUE | 0.267 | 0.265 | ||||||
| (0.014) | (0.014) | |||||||
| #observations | 18571 | 18568 | 18571 | 18571 | 18571 | 18571 | 18571 | 18571 |
| R squared | 0.494 | 0.493 | 0.492 | 0.494 | 0.496 | 0.493 | 0.502 | 0.502 |
| Adj. R Squared | 0.493 | 0.492 | 0.492 | 0.494 | 0.496 | 0.492 | 0.501 | 0.502 |
| Residual SE | 0.562 | 0.563 | 0.563 | 0.562 | 0.561 | 0.562 | 0.558 | 0.557 |
########### https://www.displayr.com/variance-inflation-factors-vifs/ USE THIS TO EXPLAIN - e.g. beds/baths/accommodates - but none of the VIFs is high enough to suggest collinearity, so we're good.
Conclusion: we should definitely include log(security_deposit).
# Final specification: linear model of price_4_nights on the listing, host,
# location, policy and amenity predictors listed below.
final_model <- lm(
  formula = price_4_nights ~
    prop_type_simplified +
    number_of_reviews +
    review_scores_rating +
    room_type +
    bedrooms +
    beds +
    bathrooms +
    accommodates +
    host_is_superhost +
    is_location_exact +
    neighbourhood_simplified +
    cancellation_policy +
    # The 0.001 offset keeps the log finite if a deposit is 0.
    log(security_deposit + 0.001) +
    wifi +
    breakfast,
  data = regression_data
)

# Coefficient table, including confidence intervals for each term.
tidy(final_model, conf.int = TRUE)
## # A tibble: 24 x 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 6.88 0.0459 150. 0. 6.79 6.97
## 2 prop_type_simplifie… -0.0335 0.0121 -2.78 5.40e- 3 -0.0572 -0.00992
## 3 prop_type_simplifie… 0.0981 0.0138 7.12 1.08e-12 0.0711 0.125
## 4 prop_type_simplifie… -0.0230 0.0152 -1.51 1.31e- 1 -0.0528 0.00683
## 5 prop_type_simplifie… 0.230 0.0120 19.1 7.04e-81 0.207 0.254
## 6 number_of_reviews -0.00209 0.000207 -10.1 6.28e-24 -0.00249 -0.00168
## 7 review_scores_rating 0.00270 0.000382 7.06 1.74e-12 0.00195 0.00344
## 8 room_typePrivate ro… -0.439 0.00989 -44.4 0. -0.459 -0.420
## 9 room_typeShared room -0.938 0.0204 -45.9 0. -0.978 -0.898
## 10 bedrooms 0.0953 0.00706 13.5 2.45e-41 0.0815 0.109
## # … with 14 more rows
| r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual | nobs |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.507 | 0.506 | 0.555 | 829 | 0 | 23 | -15394 | 30839 | 31034 | 5707 | 18547 | 18571 |
## GVIF Df GVIF^(1/(2*Df))
## prop_type_simplified 1.42 4 1.04
## number_of_reviews 1.15 1 1.07
## review_scores_rating 1.06 1 1.03
## room_type 1.34 2 1.08
## bedrooms 4.46 1 2.11
## beds 3.12 1 1.77
## bathrooms 1.64 1 1.28
## accommodates 4.50 1 2.12
## host_is_superhost 1.15 1 1.07
## is_location_exact 1.09 1 1.04
## neighbourhood_simplified 1.40 4 1.04
## cancellation_policy 1.11 2 1.03
## log(security_deposit + 0.001) 1.07 1 1.04
## wifi 1.01 1 1.00
## breakfast 1.15 1 1.07
# Listings matching the stay we want to price: private rooms in apartments
# with at least 10 reviews and a review score of 90 or more.
reading_week <- regression_data %>%
  filter(
    prop_type_simplified == "Apartment",
    room_type == "Private room",
    number_of_reviews >= 10,
    review_scores_rating >= 90
  )
reading_week

# Fix the seed so the random train/test partition is reproducible.
set.seed(6789)
train_test_split <- initial_split(reading_week, prop = 0.75)
reading_week_train <- training(train_test_split)
reading_week_test <- testing(train_test_split)

# RMSE = sqrt(mean(squared errors)). Each error must be squared *before*
# averaging; squaring the summed errors lets positive and negative errors
# cancel and does not measure prediction accuracy.
# NOTE(review): predictions come from `model1`, not the `final_model` fitted
# earlier in this section -- confirm that this is the intended model.
rmse_train <- reading_week_train %>%
  mutate(predictions = predict(model1, .)) %>%
  summarise(rmse = sqrt(mean((predictions - price_4_nights)^2))) %>%
  pull()
rmse_train

# Same metric on the held-out rows.
rmse_test <- reading_week_test %>%
  mutate(predictions = predict(model1, .)) %>%
  summarise(rmse = sqrt(mean((predictions - price_4_nights)^2))) %>%
  pull()
rmse_test
```